Pre-processing

  1. Adapters were trimmed using cutadapt v1.16
  2. Gene expression was quantified using salmon v1.3.0
  3. TPMs were obtained for the genes using tximport 1.20.0
library(dplyr)
library(ggplot2)
library(DESeq2)
## Warning: package 'GenomeInfoDb' was built under R version 4.1.1
## Warning: package 'MatrixGenerics' was built under R version 4.1.1
library(tximport)
library(readr)
library(tximportData)
library(readxl)
library(knitr)
library(tidyverse)
library(pheatmap)
library(RColorBrewer)
library(viridis)
library(ggrepel)
library(EnhancedVolcano)
library(fgsea)

R Markdown

Define Color Scheme and plot them


Plasma 1:10
Viridis 1:10
Cividis 1:10
Magma 1:10

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document.

##This section of code will generate a summary table of the samples sequenced and their sequencing and alignment metrics

Summary of Data Metrics
Sample patient reads %Q30 Duplication rate % reads with adapter STAR alignment number percent aligned splices annotated salmon mapping
Bulk HTB314 71816976 94.3 57 2.6 63614504 88.58 27340654 85.6967
Bulk MDS268 31441538 93.0 24 2.3 25549835 81.26 5546270 58.2075
Bulk MDS280 28794421 93.0 24 2.3 23317205 80.98 5115546 58.8614
CD123+ HTB314 61568500 94.0 56 2.7 55359277 89.91 22831187 86.4197
CD123+ MDS268 32307881 93.0 27 3.3 26243441 81.23 7290861 64.2150
CD123+ MDS280 23745205 93.0 34 2.6 21035358 88.59 5668653 71.3245
CD123- HTB314 83260626 94.0 58 2.6 74804722 89.84 31876605 87.6505
CD123- MDS268 27917999 93.0 26 2.7 22835189 81.79 5793322 63.9284
CD123- MDS280 24767573 93.0 32 2.6 22467937 90.72 4773823 62.7193

##This section of code will import and format the data for DeSeq2

#to generate a vector of names and file locations
files<-file.path("~/Desktop/Jordan files/Counts/salmon/salmon/", list.files("~/Desktop/Jordan files/Counts/salmon/salmon/"), "quant.sf")
names(files)<-list.files("~/Desktop/Jordan files/Counts/salmon/salmon")

#to call in a gene_map this was derived by pulling it out of the fasta file with a grep command for ENST and ENSG and pasting them together
gene_map=read_csv("~/Desktop/Jordan files/Counts/salmon/gene_map.csv", col_names=c('enstid', 'ensgid'))

# import transcript level counts
txi.salmon.t<-tximport(files, type="salmon", txOut=TRUE)
txi.salmon.g<-tximport(files=files, type="salmon", tx2gene= gene_map, ignoreTxVersion = TRUE, countsFromAbundance = 'lengthScaledTPM' )
### this code works but if I remove the ignoreTxVersion I get an error, this may have to do with how I am generating my tx2gene file

# Extract counts only
counts <- txi.salmon.g$counts %>%
  as.data.frame()
#Extract TPM
tpms <- data.frame(txi.salmon.g$abundance)

##for clients the counts and tpm files should be written out

##This chunk of code sets an expression threshold for TMP where all samples have at least 5 counts

expressed_genes<-tpms %>% filter_all(all_vars(. > 5))

##This chunk of code generates a heatmap using the spearman method as well as a correlation heatmap ## PCA plot

exp.pca <- prcomp(t(log2(expressed_genes)))
exp.pca.summary <- summary(exp.pca)$importance
pc1var = round(exp.pca.summary[3,1] * 100, 1)
pc2var = round(exp.pca.summary[3,2] * 100 - pc1var, 1)
exp.pca.pc <- data.frame(exp.pca$x, sample = colnames(expressed_genes))

PC1 vs PC2

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   SampleName = col_character(),
##   FileName = col_character(),
##   patient = col_character(),
##   cellType = col_character()
## )

## 3. Run Differential Expression testing using DESeq2 and Calculate Gene Set Enrichment ## Compare 123pos vs 123neg, 123neg vs bulk, and 123pos vs bulk #### sig = padj <0.01 and abs(l2fc) >0.5 ####

coldata<-dplyr::select(metadata, -FileName) 
coldata<-column_to_rownames(coldata, 'SampleName')
## need to confirm that all names are in the same order
all(rownames(coldata) %in% colnames(txi.salmon.g$counts))
## [1] TRUE
all(rownames(coldata) == colnames(txi.salmon.g$counts))
## [1] FALSE
coldata<-coldata[colnames(txi.salmon.g$counts),]
all(rownames(coldata) == colnames(txi.salmon.g$counts))
## [1] TRUE
dds <- DESeqDataSetFromTximport(txi.salmon.g,
                              colData = coldata,
                              design = ~ patient + cellType)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## using just counts from tximport
### unfiltered data
dds.unfiltered <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res.unfiltered <- results( dds.unfiltered)

##filtered data changed this from dds[rowMins(counts(dds))>5,] to a less stringent filtering
keep<-rowSums(counts(dds))>=10
#dds.filtered<-dds[rowMins(counts(dds))>5,]
dds.filtered<-dds[keep,]
dds.filtered<-DESeq(dds.filtered)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res.filtered<-results(dds.filtered)

###outcome the first filtering cut from 60000 genese to 14000 genes with the new filter 35000genes are left which is about 1/2
### need to subset to do pairwise comparison unclear if this keeps the patient design aspect


res_123posvs123neg_unfiltered<-results( dds.unfiltered, contrast=c("cellType", "123pos", "123neg"))
res_123posvsBulk_unfiltered<-results( dds.unfiltered, contrast=c("cellType", "123pos", "bulk"))
res_123negvsBulk_unfiltered<-results( dds.unfiltered, contrast=c("cellType", "123neg", "bulk"))

res_123posvs123neg_filtered<-results(dds.filtered, contrast=c("cellType", "123pos", "123neg"))
res_123posvsBulk_filtered<-results(dds.filtered, contrast=c("cellType", "123pos", "bulk"))
res_123negvsBulk_filtered<-results(dds.filtered, contrast=c("cellType", "123neg", "bulk"))

## look at the numbers of genes meeting threshold the log fold change call is not changing things
sum( res_123posvs123neg_unfiltered$pvalue < 0.01 & abs(res_123posvs123neg_unfiltered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 1405
sum( res_123posvsBulk_unfiltered$pvalue < 0.01 & abs(res_123posvsBulk_unfiltered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 2382
sum(res_123negvsBulk_unfiltered$pvalue < 0.01 & abs(res_123negvsBulk_unfiltered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 2336
sum( res_123posvs123neg_filtered$pvalue < 0.01 & abs(res_123posvs123neg_filtered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 1206
sum( res_123posvsBulk_filtered$pvalue < 0.01 & abs(res_123posvsBulk_filtered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 2157
sum(res_123negvsBulk_filtered$pvalue < 0.01 & abs(res_123negvsBulk_filtered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 2130
sum(res_123posvs123neg_filtered$padj < 0.1, na.rm=TRUE)
## [1] 1000
sum(res_123posvs123neg_unfiltered$padj < 0.1, na.rm=TRUE)
## [1] 1180

Volcano Plot

## Warning: Removed 19307 rows containing missing values (geom_point).

## Warning: Removed 20003 rows containing missing values (geom_point).

## Warning: Removed 20003 rows containing missing values (geom_point).

###MA plots

pca_data<-vst(dds, blind=T)
ntop=500
rv <- rowVars(assay(pca_data))
select <- order(rv, decreasing=TRUE)[seq_len(min(ntop, length(rv)))]
pca <- prcomp(t(assay(pca_data)[select,]))
percentVar2 <- data.frame(percentVar=pca$sdev^2/sum(pca$sdev^2))%>%
  mutate(pc=1:n())%>%dplyr::select(pc, percentVar)
pca_df<-pca$x%>%data.frame()%>%mutate(type=metadata$cellType)

alt_col_values=c("#88CCEE", "#CC6677", "#DDCC77")

PC1 vs PC2

pca_data2<-vst(dds.filtered, blind=T)
ntop=500
rv <- rowVars(assay(pca_data2))
select <- order(rv, decreasing=TRUE)[seq_len(min(ntop, length(rv)))]
pca2 <- prcomp(t(assay(pca_data2)[select,]))
percentVar3 <- data.frame(percentVar=pca2$sdev^2/sum(pca2$sdev^2))%>%
  mutate(pc=1:n())%>%dplyr::select(pc, percentVar)
pca_df2<-pca2$x%>%data.frame()%>%mutate(type=metadata$cellType)

alt_col_values=c("#88CCEE", "#CC6677", "#DDCC77")

PC1 vs PC2

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

###GSEA analysis

##generate gene names to go with ensembl gene id as the pathways will use gene name
mart <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl", host='www.ensembl.org')
t2g_hs <- biomaRt::getBM(attributes = c('ensembl_transcript_id', 'ensembl_gene_id', 'external_gene_name', 'refseq_mrna'), mart = mart)
ens2gene <- t2g_hs[,c(2,3)]
colnames(ens2gene)[2] <- 'Gene'
ens2gene <- unique(ens2gene)

##loading hallmark pathways and KEGG pathways

pathways.hallmark <- gmtPathways("~/Desktop/Jordan files/h.all.v7.4.symbols.gmt")
pathways.kegg<-gmtPathways("~/Desktop/Jordan files/c2.cp.kegg.v7.4.symbols.gmt")

##generating tidy data and adding it to the results file
##123pos vs 123neg analysis
res_posneg_gsea<-results(dds.filtered, contrast=c("cellType", "123pos", "123neg"),tidy=TRUE)
colnames(res_posneg_gsea)[1]<-"ensembl_gene_id"
res_posneg_gsea <- inner_join(res_posneg_gsea,ens2gene,by="ensembl_gene_id")
res_posneg_gsea2<-res_posneg_gsea%>%
  dplyr::select(Gene, stat) %>%
  na.omit() %>%
  distinct() %>%
  group_by(Gene) %>%
  summarise(stat=mean(stat))
rank_posneg<-deframe(res_posneg_gsea2)
fgseaRes_posneg <- fgsea(pathways=pathways.hallmark, stats=rank_posneg)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (1.06% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
fgseaRes_posneg_kegg<-fgsea(pathways=pathways.kegg, stats=rank_posneg)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (1.06% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
fgseaRes_posneg_tidy <-fgseaRes_posneg%>%
  as_tibble() %>%
  arrange(desc(NES))

fgseaRes_posneg_kegg_tidy <-fgseaRes_posneg_kegg%>%
  as_tibble() %>%
  arrange(desc(NES))

##123 pos vs bulk analysis

res_posbulk_gsea<-results(dds.filtered, contrast=c("cellType", "123pos", "bulk"),tidy=TRUE)
colnames(res_posbulk_gsea)[1]<-"ensembl_gene_id"
res_posbulk_gsea <- inner_join(res_posbulk_gsea,ens2gene,by="ensembl_gene_id")

res_posbulk_gsea2<-res_posbulk_gsea%>%
  dplyr::select(Gene, stat) %>%
  na.omit() %>%
  distinct() %>%
  group_by(Gene) %>%
  summarise(stat=mean(stat))
rank_posbulk<-deframe(res_posbulk_gsea2)
fgseaRes_posbulk<- fgsea(pathways=pathways.hallmark, stats=rank_posbulk)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (1.26% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
fgseaRes_posbulk_kegg<-fgsea(pathways=pathways.kegg, stats=rank_posbulk)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (1.26% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
fgseaRes_posbulk_tidy <-fgseaRes_posbulk%>%
  as_tibble() %>%
  arrange(desc(NES))

fgseaRes_posbulk_kegg_tidy <-fgseaRes_posbulk_kegg%>%
  as_tibble() %>%
  arrange(desc(NES))

##123 neg vs bulk analysis
res_negbulk_gsea<-results(dds.filtered, contrast=c("cellType", "123neg", "bulk"),tidy=TRUE)
colnames(res_negbulk_gsea)[1]<-"ensembl_gene_id"
res_negbulk_gsea <- inner_join(res_negbulk_gsea,ens2gene,by="ensembl_gene_id")


res_negbulk_gsea2<-res_negbulk_gsea%>%
  dplyr::select(Gene, stat) %>%
  na.omit() %>%
  distinct() %>%
  group_by(Gene) %>%
  summarise(stat=mean(stat))
rank_negbulk<-deframe(res_negbulk_gsea2)
fgseaRes_negbulk<- fgsea(pathways=pathways.hallmark, stats=rank_negbulk)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (1.4% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
fgseaRes_negbulk_kegg<-fgsea(pathways=pathways.kegg, stats=rank_negbulk)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (1.4% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
fgseaRes_negbulk_tidy <-fgseaRes_negbulk%>%
  as_tibble() %>%
  arrange(desc(NES))

fgseaRes_negbulk_kegg_tidy <-fgseaRes_negbulk_kegg%>%
  as_tibble() %>%
  arrange(desc(NES))
fgseaRes_posneg_tidy%>%
  dplyr::select(-leadingEdge, -ES) %>%
  arrange(padj) %>%
  DT::datatable(caption = 'Table 1: Hallmark genes with 123pos vs 123neg.')
fgseaRes_posneg_kegg_tidy%>%
  dplyr::select(-leadingEdge, -ES) %>%
  arrange(padj) %>%
  DT::datatable(caption = 'Table 1: KEGG pathways with 123pos vs 123neg.')
fgseaRes_posbulk_tidy%>%
  dplyr::select(-leadingEdge, -ES) %>%
  arrange(padj) %>%
  DT::datatable(caption = 'Table 1: Hallmark genes with 123pos vs bulk.')
fgseaRes_posbulk_kegg_tidy%>%
  dplyr::select(-leadingEdge, -ES) %>%
  arrange(padj) %>%
  DT::datatable(caption = 'Table 1: KEGG pathways with 123pos vs bulk.')
fgseaRes_negbulk_tidy%>%
  dplyr::select(-leadingEdge, -ES) %>%
  arrange(padj) %>%
  DT::datatable(caption = 'Table 1: Hallmark genes with 123neg vs bulk.')
fgseaRes_posneg_kegg_tidy%>%
  dplyr::select(-leadingEdge, -ES) %>%
  arrange(padj) %>%
  DT::datatable(caption = 'Table 1: KEGG pathways with 123neg vs bulk.')
ggplot(fgseaRes_posneg_tidy, aes(reorder(pathway, NES), NES)) +
  geom_col(aes(fill=padj<0.1)) +
  coord_flip() +
  labs(x="Pathway", y="Normalized Enrichment Score",
       title="Hallmark pathways NES from GSEA 123pos vs 123neg") + 
  theme_minimal()

ggplot(fgseaRes_posneg_kegg_tidy, aes(reorder(pathway, NES), NES)) +
  geom_col(aes(fill=padj<0.1)) +
  coord_flip() +
  labs(x="Pathway", y="Normalized Enrichment Score",
       title="Kegg pathways NES from GSEA 123pos vs 123neg") + 
  theme_minimal()

ggplot(fgseaRes_posbulk_tidy, aes(reorder(pathway, NES), NES)) +
  geom_col(aes(fill=padj<0.1)) +
  coord_flip() +
  labs(x="Pathway", y="Normalized Enrichment Score",
       title="Hallmark pathways NES from GSEA 123pos vs bulk") + 
  theme_minimal()

ggplot(fgseaRes_posbulk_kegg_tidy, aes(reorder(pathway, NES), NES)) +
  geom_col(aes(fill=padj<0.1)) +
  coord_flip() +
  labs(x="Pathway", y="Normalized Enrichment Score",
       title="Kegg pathways NES from GSEA 123pos vs bulk") + 
  theme_minimal()

ggplot(fgseaRes_negbulk_tidy, aes(reorder(pathway, NES), NES)) +
  geom_col(aes(fill=padj<0.1)) +
  coord_flip() +
  labs(x="Pathway", y="Normalized Enrichment Score",
       title="Hallmark pathways NES from GSEA 123neg vs bulk") + 
  theme_minimal()

ggplot(fgseaRes_negbulk_kegg_tidy, aes(reorder(pathway, NES), NES)) +
  geom_col(aes(fill=padj<0.1)) +
  coord_flip() +
  labs(x="Pathway", y="Normalized Enrichment Score",
       title="Kegg pathways NES from GSEA 123neg vs bulk ") + 
  theme_minimal()